#coding=utf8

"""
Build a dictionary of interwiki links from a Wikipedia XML dump.
Build a dictionary of redirects from the Freebase Wikipedia Extraction (WEX) TSV.
"""

import xml.sax
import lexer
import re
import sqlite3
import csv # for importing csv tables
import codecs
import sys
import filedict


def utf_8_encoder(unicode_csv_data):
	"""Lazily re-encode every item of ``unicode_csv_data`` as UTF-8.

	Yields ``item.encode('utf-8')`` for each item, in the original order.
	"""
	for text_line in unicode_csv_data:
		encoded_line = text_line.encode('utf-8')
		yield encoded_line

def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kvargs):
	"""Iterate over CSV rows of a unicode source, yielding lists of unicode cells.

	The lines are passed through utf_8_encoder before being handed to
	csv.reader, and every cell that comes back is decoded again with
	``unicode(cell, 'utf-8')``.

	unicode_csv_data -- iterable of unicode lines
	dialect          -- csv dialect forwarded to csv.reader
	kvargs           -- extra keyword arguments forwarded to csv.reader
	"""
	encoded_lines = utf_8_encoder(unicode_csv_data)
	for cells in csv.reader(encoded_lines, dialect=dialect, **kvargs):
		yield [unicode(raw_cell, 'utf-8') for raw_cell in cells]

def get_tsv_reader(filename):
	"""Return an iterator over the rows of a UTF-8, tab-delimited file.

	The file is opened immediately, so a missing or unreadable file still
	fails at call time. Each yielded row is a list of cells as produced by
	unicode_csv_reader. The file is closed as soon as the rows are
	exhausted or the returned generator is closed, instead of being left
	open for the rest of the process.

	filename -- path of the TSV file to read
	"""
	tsv_file = codecs.open(filename, "rb", 'utf-8')

	def _rows():
		# try/finally guarantees the handle is released even if the
		# consumer stops iterating early and closes the generator.
		try:
			for row in unicode_csv_reader(tsv_file, delimiter='\t'):
				yield row
		finally:
			tsv_file.close()

	return _rows()

#database.read_TSV_redirects("freebase-wex-2010-12-16-redirects.tsv")

def _flush_redirects(target, buffered):
	"""Write every buffered key/value pair into ``target`` within one batch."""
	with target.batch as fd:
		for k, v in buffered.iteritems():
			fd[k] = v


rtsv = get_tsv_reader("freebase-wex-redirects.tsv")

redirs = filedict.FileDict("dictionaries/enredirects2")
iwikis = filedict.FileDict("dictionaries/iwikis")
redirsbuf = {}
i = 0
for i, row in enumerate(rtsv, 1):
	# Flush every 10000 rows. The check runs before the short-row skip so
	# that a malformed row landing on a multiple of 10000 cannot postpone
	# the flush and the progress output.
	if i % 10000 == 0:
		_flush_redirects(redirs, redirsbuf)
		print(i)
		redirsbuf = {}
	# Rows with fewer than three columns cannot provide row[1] and row[2].
	if len(row) < 3:
		continue
	# Keep row[1] -> row[2] only when row[2] is a key of the iwikis
	# dictionary (Handler stores the 'en' interwiki title as the key there).
	# NOTE(review): presumably row[1] is the redirect title and row[2] its
	# target -- confirm against the WEX redirects TSV layout.
	if row[2] in iwikis:
		redirsbuf[row[1]] = row[2]

# Write whatever was collected after the last multiple of 10000 rows;
# without this the tail of the file would be silently dropped.
if redirsbuf:
	_flush_redirects(redirs, redirsbuf)
	redirsbuf = {}

# Stops the script here: nothing below this line is executed while this
# call is present.
sys.exit()

# Interwiki pairs gathered by Handler that have not yet been written to the
# iwikis FileDict.
iwikisbuf = {}

# Running page counters, updated from Handler.endElement.
total = redirects = translated = 0

### XML FILE HANDLING ###

class Idict(dict):
	"""dict whose lookups of absent keys evaluate to an empty string.

	The missing key is not inserted; only the looked-up value is defaulted.
	"""

	def __missing__(self, key):
		# Invoked by dict.__getitem__ when ``key`` is not present.
		return ''

class Handler(xml.sax.handler.ContentHandler):
	"""SAX content handler that collects interwiki pairs page by page.

	For every <page> element the handler records the page title under the
	'uk' key of ``self.iwikis`` and every match of lexer.iwiki_regex in the
	page text under its first group. Pages that are not redirects and have
	a non-empty 'en' entry are buffered in the module-level ``iwikisbuf``
	as ``en title -> uk title``. The buffer is written to the module-level
	``iwikis`` FileDict every 1000 pages and once more at end of document.

	Attributes:
	content  -- list of character chunks of the current <text> element
	intag    -- name of the element most recently opened, "" after a close
	iwikis   -- Idict with the interwiki entries of the current page
	redirect -- False, True (a <redirect> element was seen) or the value
	            returned by lexer.redirect_to for the page text
	"""

	def __init__(self):
		xml.sax.handler.ContentHandler.__init__(self)
		self.content = []
		self.intag = ""
		self.iwikis = Idict()
		self.redirect = False

	def startElement(self, name, attrs):
		"""Remember the opened element; flag the page on <redirect>."""
		self.intag = name
		if name == "redirect":
			self.redirect = True

	def endElement(self, name):
		"""Process a finished <text> or <page> element and reset tag state."""
		global total, redirects, translated, iwikisbuf
		if name == "text":
			text = "".join(self.content)
			for match in re.findall(lexer.iwiki_regex, text):
				self.iwikis[match[0]] = match[1]
			to = lexer.redirect_to(text)
			if to:
				self.redirect = to
		elif name == "page":
			total += 1
			if self.redirect:
				redirects += 1
			elif self.iwikis['en'] != '':
				translated += 1
				iwikisbuf[self.iwikis['en']] = self.iwikis['uk']
			if total % 1000 == 0:
				self._flush_iwikis()
				print('total: %d, redirects: %d, translated: %d' % (total, redirects, translated))
			# Start the next page with a clean slate.
			self.redirect = False
			self.iwikis = Idict()
		self.intag = ""
		self.content = []

	def endDocument(self):
		"""Write the pairs collected since the last periodic flush."""
		# Without this the pages after the last multiple of 1000 would
		# never reach the FileDict.
		if iwikisbuf:
			self._flush_iwikis()

	def characters(self, content):
		"""Collect character data of <title> and <text> elements."""
		if self.intag == "title":
			# A SAX parser may report one text node in several chunks
			# (for instance around an entity such as &amp;), so the title
			# is accumulated rather than overwritten by the last chunk.
			self.iwikis['uk'] = self.iwikis.get('uk', '') + content
		elif self.intag == "text":
			self.content.append(content)

	def _flush_iwikis(self):
		"""Write the buffered pairs to the iwikis FileDict and clear the buffer."""
		global iwikisbuf
		with iwikis.batch as fdict:
			for k, v in iwikisbuf.iteritems():
				fdict[k] = v
		iwikisbuf = {}
# Module-level handler instance; parsedump() passes it to xml.sax.parse.
handler = Handler()

def parsedump(dumpname):
	"""Run the SAX parser over ``dumpname`` using the module-level handler.

	dumpname -- source passed straight to xml.sax.parse
	"""
	xml.sax.parse(dumpname, handler)

# Parse the Ukrainian Wikipedia dump.
# NOTE(review): this call sits after a module-level sys.exit(), so it does
# not run as the file stands -- confirm whether that is intentional.
parsedump("ukwiki_dump.xml")

# Example of the dump file structure: a three-page excerpt in MediaWiki
# export-0.4 format (two redirect pages and one regular page).
test = '''<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.4/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.4/ http://www.mediawiki.org/xml/export-0.4.xsd" version="0.4" xml:lang="uk">
  <page>
    <title>Esperanto</title>
    <id>2</id>
    <redirect />
    <revision>
      <id>61122</id>
      <timestamp>2004-01-31T09:34:04Z</timestamp>
      <contributor>
        <ip>203.109.249.138</ip>
      </contributor>
      <text xml:space="preserve">#REDIRECT [[Есперанто]]</text>
    </revision>
  </page>
  <page>
    <title>Головна сторінка</title>
    <id>3</id>
    <restrictions>edit=sysop:move=sysop</restrictions>
    <revision>
      <id>3793739</id>
      <timestamp>2010-02-18T18:35:03Z</timestamp>
      <contributor>
        <username>Tomahiv</username>
        <id>4232</id>
      </contributor>
      <minor />
      <comment>менше за розміром, ніж цікавинки. Цікавинки інформаційно вартісніші</comment>
      <text xml:space="preserve">&lt;div style=&quot;font-family:Verdana, Arial, Helvetica, sans-serif;&quot;&gt;
{{Головна стаття/Голова}}
{|width=&quot;100%&quot; border=&quot;0&quot; cellspacing=&quot;0&quot; cellpadding=&quot;0&quot; style=&quot;margin-top:0.8em;margin-bottom:0.8em&quot;
|-
| width=&quot;54%&quot; style=&quot;border:0px;&quot;|{{Головна стаття/Пошук та довідка}}
| width=&quot;1%&quot; | &amp;nbsp;
| width=&quot;45%&quot; style=&quot;border:0px;&quot;|{{Головна стаття/Участь та спілкування}}
|}
__NOTOC__ __NOEDITSECTION__
{{MainPageInterwikis}}</text>
    </revision>
  </page>
  <page>
    <title>Wikipedia</title>
    <id>6</id>
    <redirect />
    <revision>
      <id>61125</id>
      <timestamp>2004-08-01T07:40:52Z</timestamp>
      <contributor>
        <username>Maksym Ye.</username>
        <id>76</id>
      </contributor>
      <comment>Перенаправив на Вікіпедія</comment>
      <text xml:space="preserve">#REDIRECT [[Вікіпедія]]</text>
    </revision>
  </page>
</mediawiki>
'''
# Uncomment to feed the sample above through the handler:
#xml.sax.parseString(test,handler)
